From: Tobi Oetiker tobi{at}oetiker.ch
Date: 2007-04-23

I am using rsync for hard-link backup. I found that there is a
major problem with frequent backup filling up the file system cache
with all the data from the files being backed up. The effect is
that all the other 'sensible' data in the cache gets thrown out in
the process. This is rather unfortunate as the performance of the
system becomes very bad after running rsync.

Some research showed that

  posix_fadvise64(fd, 0, 0,POSIX_FADV_DONTNEED);

would tell the OS that it should not keep the file in cache. I
have written a patch for rsync that adds the

  --drop-cache

option which activates posix_fadvise64.

There are some caveats though:

  * When calling posix_fadvise64 while writing a file, only the
    part of the cache that has already been written to disk will
    be released. This means we have to call fdatasync before
    calling posix_fadvise64, and this will unfortunately slow down
    operations considerably. On my test system I get 240 KByte/s.

    The patch has been optimized, so that the impact on large files
    will be considerably lowered by calling posix_fadvise64 only
    after a few megabytes have been written.

  * When reading a file which had already been cached *before*
    rsync read it, the content of the file will be released from
    cache nevertheless, which may not be intended. I have
    unfortunately not found a method for determining whether a
    file is in cache or not (ideas?)

    I found that running rsync on an LVM snapshot is a good way
    around this problem, since the snapshot data is cached
    separately from the original. It has the additional benefit of
    making the backups more consistent.

  * I don't really know the rsync code, so it may be that the patch
    is calling fadvise for files where this would not be necessary.

  * The patch is tested only on Linux 2.6.18

If you have any input on this, please let me know.

You can get the latest edition of the patch from

  http://tobi.oetiker.ch/patches/

cheers
tobi

Changes:

 2007-04-23

* pass --drop-cache on to the remote server
* make test works now

based-on: 1ddcdaf3f6808ba53aef9e19f630a18808de22ac
diff --git a/checksum.c b/checksum.c
--- a/checksum.c
+++ b/checksum.c
@@ -24,6 +24,10 @@
 extern int checksum_seed;
 extern int protocol_version;
 
+#ifdef HAVE_POSIX_FADVISE64
+#define close(fd) fadv_close(fd)
+#endif
+
 /*
   a simple 32 bit checksum that can be upadted from either end
   (inspired by Mark Adler's Adler-32 checksum)
diff --git a/cleanup.c b/cleanup.c
--- a/cleanup.c
+++ b/cleanup.c
@@ -47,7 +47,13 @@ void close_all(void)
 	int fd;
 	int ret;
 	STRUCT_STAT st;
+#endif
+
+#ifdef HAVE_POSIX_FADVISE64
+	fadv_close_all();
+#endif
 
+#ifdef SHUTDOWN_ALL_SOCKETS
 	max_fd = sysconf(_SC_OPEN_MAX) - 1;
 	for (fd = max_fd; fd >= 0; fd--) {
 		if ((ret = do_fstat(fd, &st)) == 0) {
diff --git a/configure.ac b/configure.ac
--- a/configure.ac
+++ b/configure.ac
@@ -570,6 +570,7 @@ AC_FUNC_ALLOCA
 AC_CHECK_FUNCS(waitpid wait4 getcwd strdup chown chmod lchmod mknod mkfifo \
     fchmod fstat ftruncate strchr readlink link utime utimes lutimes strftime \
     memmove lchown vsnprintf snprintf vasprintf asprintf setsid strpbrk \
+    posix_fadvise64 \
     strlcat strlcpy strtol mallinfo getgroups setgroups geteuid getegid \
     setlocale setmode open64 lseek64 mkstemp64 mtrace va_copy __va_copy \
     seteuid strerror putenv iconv_open locale_charset nl_langinfo getxattr \
diff --git a/fileio.c b/fileio.c
--- a/fileio.c
+++ b/fileio.c
@@ -29,6 +29,12 @@ extern int sparse_files;
 
 static OFF_T sparse_seek = 0;
 
+#ifdef HAVE_POSIX_FADVISE64
+#define close(fd) fadv_close(fd)
+#define read(fd,buf,len) fadv_read(fd,buf,len)
+#define write(fd,buf,len) fadv_write(fd,buf,len)
+#endif
+
 int sparse_end(int f, OFF_T size)
 {
 	int ret;
diff --git a/generator.c b/generator.c
--- a/generator.c
+++ b/generator.c
@@ -113,6 +113,10 @@ static int need_retouch_dir_times;
 static int need_retouch_dir_perms;
 static const char *solo_file = NULL;
 
+#ifdef HAVE_POSIX_FADVISE64
+#define close(fd) fadv_close(fd)
+#endif
+
 /* For calling delete_item() and delete_dir_contents(). */
 #define DEL_NO_UID_WRITE 	(1<<0) /* file/dir has our uid w/o write perm */
 #define DEL_RECURSE		(1<<1) /* if dir, delete all contents */
diff --git a/options.c b/options.c
--- a/options.c
+++ b/options.c
@@ -60,6 +60,7 @@ int preserve_uid = 0;
 int preserve_gid = 0;
 int preserve_times = 0;
 int update_only = 0;
+int drop_cache = 0;
 int cvs_exclude = 0;
 int dry_run = 0;
 int do_xfers = 1;
@@ -325,6 +326,9 @@ void usage(enum logcode F)
   rprintf(F,"     --backup-dir=DIR        make backups into hierarchy based in DIR\n");
   rprintf(F,"     --suffix=SUFFIX         set backup suffix (default %s w/o --backup-dir)\n",BACKUP_SUFFIX);
   rprintf(F," -u, --update                skip files that are newer on the receiver\n");
+#ifdef HAVE_POSIX_FADVISE64
+  rprintf(F,"     --drop-cache            tell OS to drop caching of file data\n");
+#endif
   rprintf(F,"     --inplace               update destination files in-place (SEE MAN PAGE)\n");
   rprintf(F,"     --append                append data onto shorter files\n");
   rprintf(F,"     --append-verify         like --append, but with old data in file checksum\n");
@@ -533,6 +537,9 @@ static struct poptOption long_options[] = {
   {"no-one-file-system",0, POPT_ARG_VAL,    &one_file_system, 0, 0, 0 },
   {"no-x",             0,  POPT_ARG_VAL,    &one_file_system, 0, 0, 0 },
   {"update",          'u', POPT_ARG_NONE,   &update_only, 0, 0, 0 },
+#ifdef HAVE_POSIX_FADVISE64
+  {"drop-cache",       0,  POPT_ARG_NONE,   &drop_cache, 0, 0, 0 },
+#endif
   {"existing",         0,  POPT_ARG_NONE,   &ignore_non_existing, 0, 0, 0 },
   {"ignore-non-existing",0,POPT_ARG_NONE,   &ignore_non_existing, 0, 0, 0 },
   {"ignore-existing",  0,  POPT_ARG_NONE,   &ignore_existing, 0, 0, 0 },
@@ -1730,6 +1737,11 @@ void server_options(char **args, int *argc_p)
 	if (!am_sender)
 		args[ac++] = "--sender";
 
+#ifdef HAVE_POSIX_FADVISE64
+	if (drop_cache)
+		args[ac++] = "--drop-cache";
+#endif
+
 	x = 1;
 	argstr[0] = '-';
 
diff --git a/receiver.c b/receiver.c
--- a/receiver.c
+++ b/receiver.c
@@ -64,6 +64,10 @@ static flist_ndx_list batch_redo_list;
 /* We're either updating the basis file or an identical copy: */
 static int updating_basis_or_equiv;
 
+#ifdef HAVE_POSIX_FADVISE64
+#define close(fd) fadv_close(fd)
+#endif
+
 #define TMPNAME_SUFFIX ".XXXXXX"
 #define TMPNAME_SUFFIX_LEN ((int)sizeof TMPNAME_SUFFIX - 1)
 
diff --git a/rsync.yo b/rsync.yo
--- a/rsync.yo
+++ b/rsync.yo
@@ -356,6 +356,7 @@ to the detailed description below for a complete description.  verb(
      --super                 receiver attempts super-user activities
      --fake-super            store/recover privileged attrs using xattrs
  -S, --sparse                handle sparse files efficiently
+     --drop-cache            tell OS to drop caching of file data
  -n, --dry-run               perform a trial run with no changes made
  -W, --whole-file            copy files whole (w/o delta-xfer algorithm)
  -x, --one-file-system       don't cross filesystem boundaries
@@ -1103,6 +1104,10 @@ dit(bf(-S, --sparse)) Try to handle sparse files efficiently so they take
 up less space on the destination.  Conflicts with bf(--inplace) because it's
 not possible to overwrite data in a sparse fashion.
 
+dit(bf(--drop-cache)) Tell the OS to drop the caching of the file data.  This
+prevents rsync from filling up the filesystem cache.  This can sometimes help
+to make a system perform better by keeping non-rsync files in the disk cache.
+
 dit(bf(-n, --dry-run)) This makes rsync perform a trial run that doesn't
 make any changes (and produces mostly the same output as a real run).  It
 is most commonly used in combination with the bf(-v, --verbose) and/or
diff --git a/sender.c b/sender.c
--- a/sender.c
+++ b/sender.c
@@ -45,6 +45,10 @@ extern int write_batch;
 extern struct stats stats;
 extern struct file_list *cur_flist, *first_flist, *dir_flist;
 
+#ifdef HAVE_POSIX_FADVISE64
+#define close(fd) fadv_close(fd)
+#endif
+
 /**
  * @file
  *
diff --git a/t_unsafe.c b/t_unsafe.c
--- a/t_unsafe.c
+++ b/t_unsafe.c
@@ -28,6 +28,7 @@ int am_root = 0;
 int read_only = 0;
 int list_only = 0;
 int verbose = 0;
+int drop_cache = 0;
 int preserve_perms = 0;
 int preserve_executability = 0;
 
diff --git a/util.c b/util.c
--- a/util.c
+++ b/util.c
@@ -25,6 +25,7 @@
 
 extern int verbose;
 extern int module_id;
+extern int drop_cache;
 extern int modify_window;
 extern int relative_paths;
 extern int preserve_times;
@@ -42,6 +43,207 @@ char curr_dir[MAXPATHLEN];
 unsigned int curr_dir_len;
 int curr_dir_depth; /* This is only set for a sanitizing daemon. */
 
+#ifdef HAVE_POSIX_FADVISE64
+/* Support for --drop-cache: wrappers around read(), write() and close() that
+ * use posix_fadvise64() to ask the OS to evict the file data rsync has
+ * touched from the cache.  Per-fd state lives in fixed-size tables indexed
+ * by fd number; fds outside the tracked range are never advised on. */
+
+/* Size of the per-fd tables and of the deferred-close ring. */
+#define FADV_TABLE_SIZE		1024
+/* Upper bound for fadv_max_fd; must not exceed FADV_TABLE_SIZE. */
+#define FADV_MAX_FD_LIMIT	1000
+/* An fd's offset must move this far past the last dropped offset before
+ * fadv_drop() advises the OS again. */
+#define FADV_BUFFER_SIZE	(1024*1024*16)
+/* Dropping trails this many bytes behind the current file offset. */
+#define FADV_TRAIL_SIZE		(1024*1024)
+/* fadv_close() flushes the ring once this many undropped bytes are queued. */
+#define FADV_CLOSE_FLUSH_SIZE	(1024*1024)
+
+/* Identity (st_dev/st_ino) of the file last seen on each tracked fd. */
+static struct stat fadv_fd_stat[FADV_TABLE_SIZE];
+/* Offset up to which each tracked fd's data has already been dropped. */
+static off_t fadv_fd_pos[FADV_TABLE_SIZE];
+static int fadv_fd_init = 0;
+/* Tracked fds are 0 .. fadv_max_fd-1; this is also the ring capacity. */
+static int fadv_max_fd = 0;
+/* Ring of dup()ed fds waiting to be synced, dropped and closed. */
+static int fadv_close_ring_tail = 0;
+static int fadv_close_ring_head = 0;
+static int fadv_close_ring_size = 0;
+static int fadv_close_ring[FADV_TABLE_SIZE];
+/* Bytes not yet dropped that belong to the fds queued in the ring. */
+static off_t fadv_close_buffer_size = 0;
+
+/* One-time setup: derive fadv_max_fd from the open-file limit (keeping 20
+ * descriptors in reserve, clamped to 1 .. FADV_MAX_FD_LIMIT) and clear the
+ * per-fd tables. */
+static void fadv_fd_init_func(void)
+{
+	long limit;
+	int i;
+
+	if (fadv_fd_init)
+		return;
+	fadv_fd_init = 1;
+
+	if (fadv_max_fd == 0) {
+		limit = sysconf(_SC_OPEN_MAX) - 20;
+		/* A value of 0 would make the ring's modulo divide by zero. */
+		if (limit < 1)
+			limit = 1;
+		if (limit > FADV_MAX_FD_LIMIT)
+			limit = FADV_MAX_FD_LIMIT;
+		fadv_max_fd = (int)limit;
+	}
+	for (i = 0; i < fadv_max_fd; i++) {
+		fadv_fd_pos[i] = 0;
+		fadv_fd_stat[i].st_dev = 0;
+		fadv_fd_stat[i].st_ino = 0;
+	}
+}
+
+/* Drop the cached data of fd up to FADV_TRAIL_SIZE bytes behind its current
+ * offset, once that point is more than FADV_BUFFER_SIZE past the previous
+ * drop.  If sync is non-zero the data is flushed with fdatasync() first.
+ * Does nothing for untracked fds or when lseek()/fstat() fail. */
+static void fadv_drop(int fd, int sync)
+{
+	struct stat sb;
+	off_t pos;
+
+	/* Initialize first: fadv_max_fd is still 0 until this has run. */
+	fadv_fd_init_func();
+	if (fd < 0 || fd >= fadv_max_fd)
+		return;
+	pos = lseek(fd, 0, SEEK_CUR);
+	if (pos < 0 || fstat(fd, &sb) < 0)
+		return;
+	/* Trail 1 MB behind in dropping. We do this to make
+	 * sure that the same block or stripe does not have
+	 * to be written twice. */
+	pos -= FADV_TRAIL_SIZE;
+
+	if (fadv_fd_stat[fd].st_dev != sb.st_dev
+	 || fadv_fd_stat[fd].st_ino != sb.st_ino) {
+		/* A different file is open on this fd now: restart tracking. */
+		fadv_fd_stat[fd].st_dev = sb.st_dev;
+		fadv_fd_stat[fd].st_ino = sb.st_ino;
+		fadv_fd_pos[fd] = 0;
+		return;
+	}
+	if (fadv_fd_pos[fd] >= pos - FADV_BUFFER_SIZE)
+		return;
+	if (sync) {
+		/* If the file is not flushed to disk before calling fadvise,
+		 * then the cache will not be freed and the advice gets ignored.
+		 * This does give a severe hit on performance. If only there
+		 * was a way to mark cache so that it gets released once the
+		 * data is written to disk. */
+		fdatasync(fd);
+	}
+	posix_fadvise64(fd, 0, pos, POSIX_FADV_DONTNEED);
+	fadv_fd_pos[fd] = pos;
+}
+
+/* write() replacement.  With --drop-cache active and data written, the data
+ * behind the file offset is synced and dropped via fadv_drop().  errno is
+ * restored afterwards so the caller only ever sees write()'s own result. */
+ssize_t fadv_write(int fd, const void *buf, size_t count)
+{
+	ssize_t ret = write(fd, buf, count);
+	if (drop_cache && ret > 0) {
+		int save_errno = errno;
+		fadv_drop(fd, 1);
+		errno = save_errno;
+	}
+	return ret;
+}
+
+/* read() replacement.  With --drop-cache active and data read, the data
+ * behind the file offset is dropped (without syncing) via fadv_drop().
+ * errno is restored afterwards so the caller only sees read()'s result. */
+ssize_t fadv_read(int fd, void *buf, size_t count)
+{
+	ssize_t ret = read(fd, buf, count);
+	if (drop_cache && ret > 0) {
+		int save_errno = errno;
+		fadv_drop(fd, 0);
+		errno = save_errno;
+	}
+	return ret;
+}
+
+/* Empty the deferred-close ring: sync, drop and close every queued fd. */
+void fadv_close_all(void)
+{
+	while (fadv_close_ring_size > 0) {
+		int fd = fadv_close_ring[fadv_close_ring_tail];
+		fdatasync(fd);
+		posix_fadvise64(fd, 0, 0, POSIX_FADV_DONTNEED);
+		/* The close() macro is only defined further down, so this is
+		 * the real close(). */
+		close(fd);
+		fadv_close_ring_tail = (fadv_close_ring_tail + 1) % fadv_max_fd;
+		fadv_close_ring_size--;
+	}
+	fadv_close_buffer_size = 0;
+}
+
+/* close() replacement.  With --drop-cache active, a dup() of fd is queued in
+ * the ring so that the expensive sync + drop can be done later for a whole
+ * group of files; fd itself is always closed right away and the result of
+ * that close() is returned. */
+int fadv_close(int fd)
+{
+	if (drop_cache) {
+		/* If the file is not flushed to disk before calling fadvise,
+		 * then the cache will not be freed and the advice gets ignored.
+		 * This does give a severe hit on performance. So instead of
+		 * doing it right away, we save a copy of the file handle and
+		 * do it some time before we are out of file handles. This
+		 * speeds up operation for small files massively. It is
+		 * directly related to the number of spare file handles you
+		 * have. */
+		off_t pos, dropped = 0;
+		int newfd;
+
+		fadv_fd_init_func();
+		if (fd >= 0 && fd < fadv_max_fd) {
+			dropped = fadv_fd_pos[fd];
+			/* The slot must not carry over to the next file that
+			 * gets opened on this fd number. */
+			fadv_fd_pos[fd] = 0;
+			fadv_fd_stat[fd].st_dev = 0;
+			fadv_fd_stat[fd].st_ino = 0;
+		}
+		/* Skip fds lseek() fails on (bad fds, pipes, sockets): there
+		 * is nothing to drop and a lingering dup would keep them open. */
+		pos = lseek(fd, 0, SEEK_CUR);
+		newfd = pos < 0 ? -1 : dup(fd);
+		if (newfd >= 0) {
+			if (pos > dropped)
+				fadv_close_buffer_size += pos - dropped;
+			fadv_close_ring[fadv_close_ring_head] = newfd;
+			fadv_close_ring_head = (fadv_close_ring_head + 1) % fadv_max_fd;
+			fadv_close_ring_size++;
+			if (fadv_close_ring_size == fadv_max_fd
+			 || fadv_close_buffer_size > FADV_CLOSE_FLUSH_SIZE) {
+				/* It seems fastest to drop things 'in groups'. */
+				fadv_close_all();
+			}
+		}
+	}
+	return close(fd);
+}
+
+/* From here on the rest of this file goes through the wrappers. */
+#define close(fd) fadv_close(fd)
+#define read(fd,buf,len) fadv_read(fd,buf,len)
+#define write(fd,buf,len) fadv_write(fd,buf,len)
+#endif
+
 /* Set a fd into nonblocking mode. */
 void set_nonblocking(int fd)
 {
diff -up a/config.h.in b/config.h.in
--- a/config.h.in
+++ b/config.h.in
@@ -292,6 +292,9 @@
 /* true if you have posix ACLs */
 #undef HAVE_POSIX_ACLS
 
+/* Define to 1 if you have the `posix_fadvise64' function. */
+#undef HAVE_POSIX_FADVISE64
+
 /* Define to 1 if you have the `putenv' function. */
 #undef HAVE_PUTENV
 
diff -up a/configure.sh b/configure.sh
--- a/configure.sh
+++ b/configure.sh
@@ -7426,6 +7426,7 @@ fi
 for ac_func in waitpid wait4 getcwd strdup chown chmod lchmod mknod mkfifo \
     fchmod fstat ftruncate strchr readlink link utime utimes lutimes strftime \
     memmove lchown vsnprintf snprintf vasprintf asprintf setsid strpbrk \
+    posix_fadvise64 \
     strlcat strlcpy strtol mallinfo getgroups setgroups geteuid getegid \
     setlocale setmode open64 lseek64 mkstemp64 mtrace va_copy __va_copy \
     seteuid strerror putenv iconv_open locale_charset nl_langinfo getxattr \
diff -up a/proto.h b/proto.h
--- a/proto.h
+++ b/proto.h
@@ -328,6 +328,10 @@ void send_id_list(int f);
 uid_t recv_user_name(int f, uid_t uid);
 gid_t recv_group_name(int f, gid_t gid, uint16 *flags_ptr);
 void recv_id_list(int f, struct file_list *flist);
+ssize_t fadv_write(int fd, const void *buf, size_t count);
+ssize_t fadv_read(int fd, void *buf, size_t count);
+void fadv_close_all(void);
+int fadv_close(int fd);
 void set_nonblocking(int fd);
 void set_blocking(int fd);
 int fd_pair(int fd[2]);
diff -up a/rsync.1 b/rsync.1
--- a/rsync.1
+++ b/rsync.1
@@ -431,6 +431,7 @@ to the detailed description below for a
      \-\-super                 receiver attempts super\-user activities
      \-\-fake\-super            store/recover privileged attrs using xattrs
  \-S, \-\-sparse                handle sparse files efficiently
+     \-\-drop\-cache            tell OS to drop caching of file data
  \-n, \-\-dry\-run               perform a trial run with no changes made
  \-W, \-\-whole\-file            copy files whole (w/o delta\-xfer algorithm)
  \-x, \-\-one\-file\-system       don'\&t cross filesystem boundaries
@@ -1272,6 +1273,11 @@ Try to handle sparse files efficiently s
 up less space on the destination.  Conflicts with \fB\-\-inplace\fP because it\(cq\&s
 not possible to overwrite data in a sparse fashion.
 .IP 
+.IP "\fB\-\-drop\-cache\fP"
+Tell the OS to drop the caching of the file data.  This
+prevents rsync from filling up the filesystem cache.  This can sometimes help
+to make a system perform better by keeping non\-rsync files in the disk cache.
+.IP 
 .IP "\fB\-n, \-\-dry\-run\fP"
 This makes rsync perform a trial run that doesn\(cq\&t
 make any changes (and produces mostly the same output as a real run).  It
